********************************************************************************
* Compare RAND-ALP Bundles with CES
* Paper Figure 1.a
********************************************************************************

clear
clear matrix
set more off
set scheme s1color
estimates clear
graph drop _all
set matsize 2500
log close _all
graph set window fontface "Times New Roman"

** Set Directory
cd "../Do"
** Set Haver Directory
// set haverdir "Haver", perm

********************************************************************************
** HAVER PULL DATA
********************************************************************************
** Import Data from Consumer Expenditure Survey 
import haver cesxhfj@surveys cesxhfr@surveys cesxhff@surveys cesxhfs@surveys cesxhfe@surveys cesxv@surveys , clear
rename (cesxhfj cesxhfr cesxhff cesxhfs cesxhfe cesxv) (appliances furniture floor_cov small_appliances household_equip audio_visual)
gen ces_appliances = appliances
gen ces_appliances_furnishings = appliances + furniture + floor_cov + small_appliances + household_equip 
gen ces_appliances_furnishings_av = appliances + furniture + floor_cov + small_appliances + household_equip + audio_visual

tempfile haverd
save `haverd'

********************************************************************************
** LOAD AND CLEAN DURABLES DATA
********************************************************************************

use ../Data/matched_data_durables_jun2018_baseline.dta, clear


*labelling the 3 different durables spending variables; all are composites; one is nominal; another is "real" deflated using a single (appliances) cpi; another is "real" deflated on a good-by-good basis (in previous merge file)
la var durables "nominal durable goods spending"
la var durables_real1 "real durable goods spending, single deflator"
la var durables_real2 "real durable goods spending, separate deflators"

drop ethnicity

recode mort (5=0)
recode stocks (5=0)
recode retacct (5=0)
recode howner (5=0)

* Recode some expectations variables from the Inflation surveys. 

* unemployment dummies
recode q6 (1=1) (2 3 = 0), gen(unemp_increase)
recode q6 (3=1) (1 2 = 0), gen(unemp_decrease)

gen conditions_12m = q2a
* note everybody who says "other" is in separate category
recode q2a (1=1) (2 3 = 0), gen(conditions_12m_better)
recode q2a (2=1) (1 3 = 0), gen(conditions_12m_worse)

gen interestrate_12m = q7
recode interestrate_12m (1=1) (2 3 = 0), gen(intrate_12m_up)
recode interestrate_12m (3=1) (1 2 = 0), gen(intrate_12m_down)

gen bconditions_12m = q4
* note everybody who says "other" is in separate category
recode q4 (1=1) (2 3 = 0), gen(bconditions_12m_better)
recode q4 (2=1) (1 3 = 0), gen(bconditions_12m_worse)

* house price forecasts

gen hppoint = .
replace hppoint = 0 if q41==3
replace hppoint = q42 if q41==1
replace hppoint = -q42 if q41==2
* there are some extreme outliers (in the tens of thousands)
replace hppoint = . if abs(hppoint)>=200

la var hppoint "House price expectation"

*** Prepare some additional descriptives
* Race isn't reported in all periods. 
preserve
collapse (mean) race , by(prim_key)
sort prim_key
tempfile race
save `race'
restore

drop race
sort prim_key
merge m:1 prim_key using `race'
tab _merge
drop _merge
drop if prim_key==""

recode race (1=1) (nonmissing = 0), gen(white)
gen nonwhite = 1-white
recode gender (2=1) (1=0), gen(female)
recode highesteducation (4 9 = 0) (10/16 = 1), gen(coll)

* q31s* gives codes. Kind of odd that they've created separate variables. 
*MB: odd to generate employed variable, because everyone in sample is employed--however some also say they are retired (?) 
gen employed = . 
replace employed=1 if q31s1==1
replace employed = 0 if (q31s2==2 | q31s3==3 | q31s4==4 | q31s5==5 | q31s6==6 | q31s7==7) & q31s1!=1

* generate currently retired variable.
*some say they are retired even though all are employed; 
drop retired 
gen retired = . 
replace retired = 1 if q31s5==5
replace retired = 0 if (q31s1==1 | q31s2==2 | q31s3==3 | q31s4==4 | q31s6==6 | q31s7==7) & q31s5!=5
tab retired

gen gas_expect = . 
replace gas_expect = 0 if q47a==3
replace gas_expect = q47a_higher if q47a==1
replace gas_expect = -q47a_lower if q47a==2

*replace howner=0 if howner==.
*adds ten people to homeowner status; there are still some with howner_fix==0 who have positive mortgage payment, but missing data for has mortgage and/or mortgage amount
*two observations have howner==0 and mort==1 (say they're not a homeowner but they say they have a mortgage); not important so not recoding them for now

drop if mort==0 & amtmort!=. & amtmort>100
*fixing the mort indicator to impute plausible values for people based on other information
replace mort = 0 if (howner!=1 | mortgage==0) & mort==.
replace mort = 1 if (howner==1 | (mortgage>0 & mortgage!=.)) & mort==.

la var mort "Has Mortgage"

*MB: generating quasi-continuous income variable based on midpoint of ranges of annual household income variables (familyincome and familyincome_part2)
rename familyincome inc2
rename familyincome_part2 inc2_2
drop if inc2==.
*here is the income variable: "new_faminc"
gen new_faminc=.
replace new_faminc=2500 if inc2==1
replace new_faminc=6250 if inc2==2
replace new_faminc=8750 if inc2==3
replace new_faminc=11250 if inc2==4
replace new_faminc=13750 if inc2==5
replace new_faminc=17500 if inc2==6
replace new_faminc=22500 if inc2==7
replace new_faminc=27500 if inc2==8
replace new_faminc=32500 if inc2==9
replace new_faminc=37500 if inc2==10
replace new_faminc=45000 if inc2==11
replace new_faminc=55000 if inc2==12
replace new_faminc=67500 if inc2==13
replace new_faminc=87500 if inc2==14 & (inc2_2==1 | inc2_2==.)
*above accounts for one person with inc2==14 and inc2_2 missing; not sure why that's the case but I asigned them the lowest category of income over $75000
replace new_faminc=112500 if inc2==14 & inc2_2==2
replace new_faminc=162500 if inc2==14 & inc2_2==3
replace new_faminc=237500 if inc2==14 & inc2_2==4

gen log_new_faminc=log(new_faminc)

sort prim_key quarter
*bringing in SAMPLE WEIGHTS, to quarterly data: warning, we might lose observations if the set requiring weights has changed ; this will affect our ability to run regs using non-employed types (can only do unweighted) 
merge m:1 prim_key using ../Data/qweights_pooled.dta
*these are the reg weights, the full sample weights will still be called weight_full
*weight variable is called "weight_samp" to indicate the weights were designed for the regression sample
*134 observations dropped that didn't merge with a weight_samp
drop if _merge!=3
drop _merge

*MB December 2019: key juncture: impose different sample restrictions and use different real/nominal durables
*********************
*define regression sample before recentering any variables: dropping extreme values
*below should drop top 2 highest values of durables spending; I doubt it makes any difference 
*JIMIN: try turning on and off the following restrictions
drop if durables>20000  | durables_real1>20000 | durables_real2>20000
drop if prim_key=="5041140:1"
drop if mortgage>200000 & mortgage!=.
drop if d_inflmedian>35
drop if d_longinflmedian>35
drop if hppoint<-50
**end of optional restrictions 

*assigning locals for weights (can turn on or off in regression)
local weights "[pweight=weight_samp]"
local weights_full "[pweight=weight_full]"
local pwfile "_pw"
*drop those with missing values for regressors
la var d_inflmedian "Inflation Expectation (SR)"
la var d_infliqr "Inflation Uncertainty (SR)"
la var d_longinflmedian "Inflation Expectation (MR)"
la var d_longinfliqr "Inflation Uncertainty (MR)"
la var lag_IE "Lagged Inflation Expectation (SR)"
la var lag_infl_iqr "Lagged Infl. Uncertainty (SR)"
la var hppoint "House price expectation"
la var gas_expect "Gas price expectation"

*variable for sum of monthly payments--interactions between this variable and IE may be included in some models
gen payments=mortgage+car if howner==1
replace payments=rent+car if howner!=1
gen log_payments=.
replace log_payments=log(payments) if payments>0
replace log_payments=0 if payments==0
la var log_payments "Fixed Mnthly Paymnts (Log)"
*drop any observations with extreme value for payments (110,000): only if running a regression interacting with payments: actually none dropped here (unlike nondurables)
drop if payments>100000

*defining sample; removing those with missing values
drop if weight_samp==.
drop if d_inflmedian==.
drop if d_infliqr==.
drop if durables==.
*dropping lagged IE: only need if we include lag IE in regression 
drop if lag_IE==.
drop if lag_infl_iqr==.
*new income variable: new_faminc is recode of categorical variables familyincome and familyincome_part2; former variable was earnings last month and highly unreliable
drop if new_faminc==.
drop if intrate_12m_up==.
drop if intrate_12m_down==.
drop if unemp_increase==.
drop if unemp_decrease==.
drop if rw_expect==.
drop if d_wageiqr==.
drop if rage==.
drop if nonwhite==.
drop if female==.
drop if coll==.
drop if retired==.
drop if mort==.
*below results in loss of 300+ observations--results are robust not imposing this restriction and omitting hppoint from regressions
drop if hppoint==.
*below drops 111 observations--robustness applies again 
drop if howner==.

destring prim_key, generate(id_new) ignore(":")

*generate total durables spending within household
*add to below: use durables_real1 and durables_real2 instead--number of observations of "durables" per household will be identical to that for either "durables_real1" or "durables_real2", so no need to repeat below for alternate versions
egen tot_durables=total(durables), by(id_new)
sum tot_durables, d
*below drops 142 observations associated with households who never purchased durables under period of observation (reported zero spending on durables) 
drop if tot_durables==0
*generate variable that equals 1 in all cases, to sum to determine observations per person
gen pre_obs=1
*generate sum of observations per person
egen obs2=total(pre_obs), by(id_new)
sum obs2, d

**given the above, the command below defines the regression sample: 
*The regression sample is restricted to households having nonzero durables spending in at least one period (based on total durables spending within household) AND
*with at least 3 total quarterly observations of durable goods spending (whether zero or otherwise)  

gen durables_sample_hp=(obs2>=3)

*now generate alternative durable goods spending totals
*1. generate bundle of spending on major appliances only, not including TVs or computers (compare to CES major appliances--set as "ces_appliances" above)
local appliances "pricefridge priceoven pricedwasher pricewasher"
egen appliances_spend=rowtotal(`appliances')
 
*2. generate bundle of spending on major appliances plus TV and computer spending (also compare to ces_appliances, as that may include computers/TVs)
local appliances_TV_computer "pricefridge priceoven pricedwasher pricewasher pricetv pricecomputer"
egen appliances_TV_computer_spend=rowtotal(`appliances_TV_computer')

*3. generate bundle of spending on major appliances plus combined spending on furniture, floor coverings, small appliances and miscellaneous housewares (denoted just "furniture" in our data)
*compare this bundle to (similar) CES sum of major appliances plus furniture, rugs, small appliances and misc household equipment (defined as ces_appliances_furnishings above)
local appliances_furnishings "pricefridge priceoven pricedwasher pricewasher furniture"
egen appliances_furnishings_spend=rowtotal(`appliances_furnishings')

tempfile dur
save `dur'

********************************************************************************
* CREATE FIGURE
********************************************************************************

** using the durables sample, we want to generate the mean (or median) durable goods spending (as well as other bundles) per quarter (indicated by "quarter" variable); include zero spending values;

use `dur', clear
collapse (mean) durables appliances_spend appliances_TV_computer_spend appliances_furnishings_spend if durables_sample_hp==1 [pweight=weight_samp], by(quarter)
gen year = yofd(dofq(quarter))
**determine weight variables for the 4th quarter of 2009 and 2011 using the average weight in 2010 and 2012
bysort year (quarter): egen yearly_spending = total(durables)
gen ratio_spending = durables/yearly_spending
egen weight_temp = total(ratio_spending) if quarter == tq(2010q4) | quarter == tq(2012q4)
egen weight = min(weight_temp)
replace weight = weight/2 //average of the two quarter weights that we care about
drop weight_temp
collapse (sum) durables appliances* (last) weight, by(year)
replace durables = (1/(weight))*durables if year == 2009
replace appliances_spend = (1/(weight))*appliances_spend if year == 2009
replace appliances_TV_computer_spend = (1/(weight))*appliances_TV_computer_spend if year == 2009
replace appliances_furnishings_spend = (1/(weight))*appliances_furnishings_spend if year == 2009
replace durables = (1/(1-weight))*durables if year == 2011
replace appliances_spend = (1/(1-weight))*appliances_spend if year == 2011
replace appliances_TV_computer_spend = (1/(1-weight))*appliances_TV_computer_spend if year == 2011
replace appliances_furnishings_spend = (1/(1-weight))*appliances_furnishings_spend if year == 2011
tsset year
tempfile dur_year 
save `dur_year'

** Now generate graphs showing our sample mean appliances (+ others?) spending (by year) against the comparison spending data

use `haverd', clear
rename time year
merge 1:1 year using `dur_year', gen(havermatch)

sort year
keep if havermatch==3

gen ratio_alp_ces=durables/ces_appliances_furnishings
egen ave_ratio_alp_ces=mean(ratio_alp_ces)

la var durables "Durable Goods Spending, Regression Sample (Nominal)"
la var appliances_spend "Major Appliances Spending, Regression Sample"
la var appliances_TV_computer_spend "Major Appliances Plus TV and Computer Spending, Regression Sample"
la var appliances_furnishings_spend "Major Appliances Plus Household Furnishings Spending, Regression Sample" 
la var ces_appliances "Major Appliances Spending, Consumer Expenditure Survey"
la var ces_appliances_furnishings "Major Appliances Plus Household Furnishings Spending, Consumer Expenditure Survey"
la var ces_appliances_furnishings_av "Major Appliances Plus Household Household Furnishings and A/V Spending, Consumer Expenditure Survey"

tw scatter durables ces_appliances_furnishings /*ces_appliances_furnishings_av*/ year if year >= 2009 & year <=2012, ///
	/*title("Durable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey", size(medsmall))*/ ///
	ylab(800(200)1600, ang(h) labsize(medlarge)) xlab(, labsize(medlarge)) ytitle("Expenditure ({c S|})", size(medlarge)) ///
	legend(rows(3) size(medlarge) order(1 "Durable Goods Spending, RAND-ALP" 2 "Durable Goods Spending, CES") region(lstyle(none))) xtitle("") ///
	msize(large large large)
	graph save "../Figures/1a.gph", replace
graph export ../Figures/figure_1a.png, as(png) replace

set scheme s1mono
tw scatter durables ces_appliances_furnishings /*ces_appliances_furnishings_av*/ year if year >= 2009 & year <=2012, ///
	/*title("Durable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey", size(medsmall))*/ ///
	ylab(800(200)1600, ang(h) labsize(medlarge)) xlab(, labsize(medlarge)) ytitle("Expenditure ({c S|})", size(medlarge)) ///
	legend(rows(3) size(medlarge) order(1 "Durable Goods Spending, RAND-ALP" 2 "Durable Goods Spending, CES") region(lstyle(none))) xtitle("") ///
	msize(large large large)
	graph save "../Figures/1a_bw.gph", replace
graph export ../Figures/figure_1a_bw.png, as(png) replace
